Libraries

suppressPackageStartupMessages({
  # Load required packages
  library(SingleCellExperiment)
  library(dplyr)
  library(magrittr)
  library(janitor)
  library(tidyr)
  library(stringr)
  library(readxl)
  library(data.table)
  library(cowplot)
  library(ComplexHeatmap)
  library(circlize)
  library(scran)
  library(cutpointr)
  library(glmnet)
  library(ClassifyR)
  library(survival)
  library(survminer)
  library(ggthemr)
  source("benchmarkUtils.R")
})

rename <- dplyr::rename
select <- dplyr::select

# Set ggplot theme
ggthemr_reset()
ggthemr('pale')
mycolors <- c("#999999", "#0072B2", "#E69F00", "#F0E442", "#D9B3FF", "#009E73",  
              "#D55E00", "#5D8AA8", "#CC79A7", "#56B4E9",
              "#F3B3A6", "#A5AB81", "#B2182B", "#4393C3", "#CDBE6B", 
              "#80CDC1", "#F4A582", "#BABABA", "#CCEBC5", "#DECBE4",
              "#FDDFDF", "#B3DE69", "#FDBF6F", "#CCECE6", "#FB8072")

dl_method_pal <- c(
  "TCGN" = "#CA9C91", 
  "THItoGene" ="#BA7FB5", 
  "EGNv1" = "#B3D46B", 
  "EGNv2" =  "#F7CBDF",
  "DeepPT"="#80B1D2", 
  "DeepSpaCE"="#F18072", 
  "ST-Net"="#8CD0C3",
  "HisToGene"="#f1c232",
  "Hist2ST"="#F9B063",
  "GeneCodeR"="#BCB9D8",
  "RNA-Seq"="#BABABA",
  "RNA-Seq-STgenes"="#BABABA"
  )

mypalette <- define_palette(
  swatch = mycolors,
  gradient = c(lower = mycolors[1L], upper = mycolors[3L])
)

ggthemr(mypalette) # for some reason it uses the first colour as the colour of the grid lines
theme_update(panel.grid.major = element_line(linetype="dotted"))

th <-   theme(text=element_text(size=15),
                axis.text.x = element_text(angle = 0, hjust = 0.5),
                panel.grid.major = element_blank(),
                panel.grid.minor = element_blank(),
                panel.background = element_rect(colour = "black", size=0.7, fill=NA) )
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Data Loading

# load the dataset
tcga_rnaseq_data <- readRDS(
  file="data/raw/tcga_brca/tcga_breast_matched_rnaseq_sce.rds"
)
# Get calculated metrics
pred_feat_cor <- readRDS("data/processed/her2st/her2st_pred_feat_cor_11.rds") %>%
  mutate(model_id = case_when(
    model_id == "genecoder_i500_j500" ~ "GeneCodeR",
    TRUE ~ model_id
  )) %>%
  mutate(ssim = ifelse(ssim < 0, 0, ssim))

gene_cols <- pred_feat_cor %>% distinct(gene) %>% pull

# Get TNBC Patients
tcga_clinical_full <- read.delim("data/raw/tcga_brca/BRCA.clin.merged.txt", header=FALSE) %>%
  t() %>%
  as.data.table() %>%
  row_to_names(1)

tnbc_pats <- tcga_clinical_full %>%
  filter(`patient.lab_proc_her2_neu_immunohistochemistry_receptor_status`=="negative" &
  patient.breast_carcinoma_estrogen_receptor_status=="negative" &
  patient.breast_carcinoma_progesterone_receptor_status=="negative") %>%
  pull(patient.bcr_patient_barcode) %>%
  toupper()

# Calculate HV Genes
dec_tcga <- modelGeneVar(assay(tcga_rnaseq_data, "normalized_count")[which(rownames(assay(tcga_rnaseq_data, "normalized_count")) %in% gene_cols),])
# Get the top 10% of genes.
hv_genes_tcga <- getTopHVGs(dec_tcga, fdr.threshold = .05)

Load data

The following is a dataframe with the following columns:

tcga_pred_comb_dat <- read.csv('data/raw/tcga_brca/her2st_tcga_pred_comb_dat_mean.csv')
glimpse(tcga_pred_comb_dat)
## Rows: 4,947,145
## Columns: 7
## $ img_id   <chr> "TCGA-4H-AAAK-01Z-00-DX1", "TCGA-5L-AAT0-01Z-00-DX1", "TCGA-5…
## $ gene     <chr> "HPS6", "HPS6", "HPS6", "HPS6", "HPS6", "HPS6", "HPS6", "HPS6…
## $ pred     <dbl> 0.1395431, 0.1476364, 0.1347771, 0.1544416, 0.1290884, 0.1219…
## $ model_id <chr> "HisToGene", "HisToGene", "HisToGene", "HisToGene", "HisToGen…
## $ pat_id   <chr> "TCGA-4H-AAAK", "TCGA-5L-AAT0", "TCGA-5L-AAT1", "TCGA-5T-A9QA…
## $ rna_id   <chr> "TCGA-4H-AAAK-01A-12R-A41B-07", "TCGA-5L-AAT0-01A-12R-A41B-07…
## $ exprs    <dbl> 570.2128, 870.5089, 722.9014, 463.9121, 591.1835, 880.7978, 6…

Patient-level Correlation

Boxplot of patient-level correlations between predicted pseudobulk gene expression and RNA-Seq bulk gene expression over all analysed TCGA-BRCA images split by method. Models were trained on the HER2+ ST dataset, selecting the model from the 4-fold CV with the best test set performance.

tcga_pred_pat_m_df <- tcga_pred_comb_dat %>%
  filter(log(exprs) > 5) %>%
  mutate(pred = log(ifelse(pred < 0, 0, pred)+1)) %>%
  mutate(exprs = log(exprs)) %>%
  filter(!is.na(pred)) %>%
  group_by(pat_id, img_id, model_id) %>%
  summarise(
    cor_pearson = cor(exprs, pred, method = "pearson"),
    cor_spearman = cor(exprs, pred, method = "spearman"),
    var_exprs = var(exprs),
    var_pred = var(pred),
    mean_exprs = mean(exprs),
    mean_pred = mean(pred),
    rmse = sqrt(sum((pred - exprs) ^ 2) / n()),
    mi = calculate_MI(pred, exprs),
    js_div = calculate_JS_divergence(pred, exprs),
    auc_0 = calculate_AUC(pred, exprs, 0),
    nrmse_range = calculate_nrmse(pred, exprs, "range"),
    nrmse_sd = calculate_nrmse(pred, exprs, "sd"),
    .groups = "drop"
  )

Figure 4b

tcga_pred_pat_m_df <- tcga_pred_pat_m_df%>%
  arrange(factor(model_id, levels=c("ST-Net", "HisToGene", "GeneCodeR", "DeepSpaCE", "DeepPT", "Hist2ST", "EGNv1", "EGNv2", "TCGN", "THItoGene")))
tcga_pred_pat_m_df$model_id <- factor(tcga_pred_pat_m_df$model_id, levels = unique(tcga_pred_pat_m_df$model_id))

p_pat_cor <- tcga_pred_pat_m_df %>%
  ggplot()+
  aes(x=model_id,y=cor_pearson, col=model_id, fill=model_id)+
  geom_violin(alpha=0.2)+
  geom_boxplot(alpha=0.65, width=0.3)+
  labs(y="PCC", x="")+
  scale_color_manual(values=dl_method_pal)+
  scale_fill_manual(values=dl_method_pal)+
  theme(legend.position="none") + 
  th

p_pat_cor

HistoQC - Relationship with image QC

Scatterplot of patient-level correlations between predicted pseudobulk gene expression and RNA-Seq bulk gene expression vs. first principal component of calculated H&E quality control metrics. H&E QC metrics were calculated on stage I breast cancer patients. Line of best fit plotted along with Pearson correlation and associated p-value from a correlation test.

Figure 4a

tcga_her2_qc_dat <- read.delim("data/raw/qc_tcga_her2_results_first.tsv",
                               sep = "\t", header=TRUE, skip=5, row.names = NULL) %>%
  `colnames<-`(colnames(.)[c(2:ncol(.), 1)]) %>%
  dplyr::select(-comments, -row.names) %>%
  rename(filename=X.dataset.filename)

tcga_her2_qc_stand <- read.delim("data/raw/qc_tcga_her2_results_standard.tsv",
                               sep = "\t", header=TRUE, skip=5, row.names = NULL) %>%
  `colnames<-`(colnames(.)[c(2:ncol(.), 1)]) %>%
  dplyr::select(-comments, -row.names) %>%
  rename(filename=X.dataset.filename)

tcga_her2_qc_df <- tcga_her2_qc_dat %>%
  mutate(pat_id = gsub(".tiff", "", filename)) %>%
  dplyr::select(
    c(
      michelson_contrast ,
      rms_contrast , 
      grayscale_brightness,     
      chan1_brightness, 
      chan2_brightness, 
      chan3_brightness,
      chan1_brightness_YUV,
      chan2_brightness_YUV,
      chan3_brightness_YUV, pat_id))

tcga_her2_qc_stand_df <- tcga_her2_qc_stand %>%
  mutate(pat_id = gsub(".tiff", "", filename)) %>%
  dplyr::select(c(pat_id))

merge_her2_qc <- merge(tcga_her2_qc_df, tcga_her2_qc_stand_df, by = "pat_id")

pca_res <- prcomp(merge_her2_qc %>% 
                    select(-pat_id),
                  center=TRUE,
                  scale=TRUE)

p_pc_pat_cor <- data.frame(
  PC1=pca_res$x[,"PC1"],
  pat_id=merge_her2_qc %>% select(pat_id)
) %>%
  left_join(
    tcga_pred_pat_m_df %>% 
      select(pat_id, model_id, cor_pearson),
    by="pat_id"
  ) 

p_pc_pat_cor_res <- p_pc_pat_cor %>%
  filter(PC1 < 4) %>%
  group_by(model_id) %>%
  group_modify(~{
    tidy(cor.test(.x$cor_pearson, .x$PC1)) %>%
      mutate(n = nrow(.x))
  }) %>%
  mutate(
  report = sprintf(
    "italic(t)~(%d)~'='~%.2f*','~italic(p)~'='~%.2g~','~italic(r)~'='~%.2f*','~'95%%~CI'~'['~%.2f*','~%.2f~']'",
    n - 2, statistic, p.value, estimate, conf.low, conf.high
  )
)

p_tcga_histoqc <- p_pc_pat_cor %>% 
  filter(PC1 < 4) %>% 
  ggplot() +
  aes(x = PC1, y = cor_pearson) +
  geom_point() +
  geom_smooth(data = subset(p_pc_pat_cor, PC1 < 4), 
              method = "lm", formula = "y~x", alpha = .1) +
  geom_text(
    data = p_pc_pat_cor_res,
    aes(label = report, x = Inf, y = Inf),
    hjust = 1, vjust = 1.2, size = 2, parse = TRUE, col = "grey20"
  ) +
  facet_wrap(~model_id, nrow = 1, scales = "free_y") +
  labs(
    x = "HistoQC features",
    y = "PCC",
    title = "Correlation Analysis with HistoQC Features"
  ) +
  th

p_tcga_histoqc

Prognostic Models in Breast Cancer Subtypes

Xu J, Qin S, Yi Y, Gao H, Liu X, Ma F, Guan M. Delving into the Heterogeneity of Different Breast Cancer Subtypes and the Prognostic Models Utilizing scRNA-Seq and Bulk RNA-Seq. International Journal of Molecular Sciences. 2022; 23(17):9936. https://doi.org/10.3390/ijms23179936

bc_pat_luminal <- tcga_clinical_full %>%
  filter( (patient.breast_carcinoma_estrogen_receptor_status=="positive" &
              patient.breast_carcinoma_progesterone_receptor_status=="positive") |  
             (patient.breast_carcinoma_estrogen_receptor_status=="positive" &
                patient.breast_carcinoma_progesterone_receptor_status=="negative")) %>%
  pull(patient.bcr_patient_barcode) %>%
  toupper()

bc_pat_her2 <- tcga_clinical_full %>%
  filter( `patient.lab_proc_her2_neu_immunohistochemistry_receptor_status`=="positive" 
          ) %>% 
  pull(patient.bcr_patient_barcode) %>%
  toupper()

bc_pat_tnbc <- tcga_clinical_full %>%
  filter(`patient.lab_proc_her2_neu_immunohistochemistry_receptor_status`=="negative" &
           patient.breast_carcinoma_estrogen_receptor_status=="negative" &
           patient.breast_carcinoma_progesterone_receptor_status=="negative") %>% 
  pull(patient.bcr_patient_barcode) %>%
  toupper
# Get data
# Baseline Survival Prediction using TCGA
dpnd_var_df <- colData(tcga_rnaseq_data) %>%
  as.data.frame() %>%
  select(vital_status, days_to_collection, days_to_last_follow_up, ajcc_pathologic_stage, 
         days_to_death, patient,
         paper_vital_status, paper_days_to_last_followup, paper_pathologic_stage) %>%
  tibble::rownames_to_column("rna_id")

# Consider only the samples in the predicted data
tcga_dl_samp_id <- tcga_pred_comb_dat %>%
  distinct(rna_id) %>% 
  pull

tcga_samp_id <- dpnd_var_df %>%
  tibble::rownames_to_column() %>%
  filter(rna_id  %in% (tcga_dl_samp_id)) %>%
  pull(rna_id)

task_data <- bind_cols(
  t(assay(tcga_rnaseq_data, "normalized_count")) %>%
    as.data.frame(),
  dpnd_var_df %>%
    select(patient, vital_status, days_to_last_follow_up, days_to_death) %>%
    mutate(days_to_last_follow_up = ifelse(is.na(days_to_death),
                                           days_to_last_follow_up,
                                           days_to_death)) %>%
    select(-days_to_death)
) %>%
  mutate(vital_status =  as.integer(as.factor(vital_status)) - 1) %>%
  filter(!is.na(vital_status)) %>%
  janitor::clean_names()

# Clean data by removing lowest 20% absolute exp and lowest 10% variance
# Calculate the absolute expression values for each gene
exp_sum <- task_data %>%
  select(-c(patient, vital_status, days_to_last_follow_up)) %>%
  apply(., 2, sum)

# Calculate variance for each gene across samples
exp_var <- task_data %>%
  select(-c(patient, vital_status, days_to_last_follow_up)) %>%
  apply(., 2, var)

# Determine threshold values for selecting genes
abs_threshold <- quantile(exp_sum, 0.2)
variance_threshold <- quantile(exp_var, 0.1)


# Filter genes based on criteria
task_data_p <- task_data %>%
  select(
    c(patient, vital_status, days_to_last_follow_up), 
    all_of(names(which(exp_sum > abs_threshold & exp_var > variance_threshold)))
  ) %>%
  filter(days_to_last_follow_up > 0) %>%
  mutate_at(vars(-c("patient","vital_status","days_to_last_follow_up")),
            function(col) log(col+1))

model_names <- tcga_pred_comb_dat %>%
  distinct(model_id) %>%
  pull()

getSurvPredGEDatList <- function(pat_subset) {
  lapply(model_names,
       function(model_name) {
         tsk_dat <- dpnd_var_df %>%
           select(patient, rna_id, vital_status, days_to_last_follow_up, days_to_death) %>%
           mutate(days_to_last_follow_up = ifelse(is.na(days_to_death),
                                                  days_to_last_follow_up,
                                                  days_to_death)) %>%
           select(-days_to_death) %>% 
           filter(patient %in% pat_subset) %>%
           select(-patient) %>%
           left_join(
             tcga_pred_comb_dat %>%
               filter(model_id == model_name) %>%
               select(-exprs) %>%
               mutate(pred = ifelse(pred < 0, 0, pred)) %>%
               mutate(pred = log(pred+1)) %>% 
               # do mean normalisation
               group_by(pat_id, model_id) %>%
               mutate(pred = pred - mean(pred)) %>%
               pivot_wider(names_from = "gene",
                           values_from = "pred"),
             by = c("rna_id" = "rna_id")
           ) %>%
           tibble::column_to_rownames(var="rna_id") %>%
           select(-c(img_id, model_id, pat_id)) %>%
           mutate(vital_status =  as.integer(as.factor(vital_status))-1) %>%
           # Remove NA in dependent var
           filter(!is.na(vital_status)) %>%
           filter(!is.na(days_to_last_follow_up) & days_to_last_follow_up > 0) %>%
           filter(!is.na(HPS6)) %>%
           janitor::clean_names()
         
         return(tsk_dat)
       })
  }

calcSurvRisk <- function(surv_data) {
  # Univariate Cox regression, get p-values
  # Initialize an empty data frame to store p-values
  p_value_df <- data.frame(gene = character(0), p_value = numeric(0))
  
  # Loop through each gene column
  for (gene_col in colnames(surv_data)[!(colnames(surv_data) %in% 
                                         c("vital_status", "days_to_last_follow_up"))]) {
    # Fit a univariate Cox regression model
    cox_model <- coxph(Surv(days_to_last_follow_up, vital_status) ~ get(gene_col), 
                       data = surv_data)
    
    # Extract the p-value and store it in the p_values data frame
    p_value <- summary(cox_model)$coefficients[, "Pr(>|z|)"]
    p_value_df <- rbind(p_value_df, data.frame(gene = gene_col, p_value = p_value))
  }
  #p_value_df <- data.frame(p_value_df)
  # Filter genes with p-value < 0.05
  if (sum(p_value_df$p_value < 0.005,na.rm=T) < 5) {
    sign_uni_cox_genes <- p_value_df %>%
      arrange(p_value) %>%
      dplyr::slice(1:10)
  } else {
    sign_uni_cox_genes <- p_value_df[p_value_df$p_value < 0.005, ] %>% 
      filter(!is.na(p_value))
  }
  # Prepare the input data for Lasso regression
  X <- surv_data[, sign_uni_cox_genes$gene, drop = FALSE]
  Y <- surv_data[, c("days_to_last_follow_up", "vital_status")]
  
  # Fit a Lasso regression model
  lasso_model <- cv.glmnet(as.matrix(X), Surv(Y$days_to_last_follow_up, Y$vital_status), 
                           alpha = 1, family = "cox")
  
  # Extract the selected genes based on Lasso regularization,
  # get the model with coefficients >= 8
  if (any(lasso_model$nzero >= 8)) {
    selected_genes <- coef(lasso_model, s = lasso_model$lambda[which(lasso_model$nzero >= 9)[1]-1])
  } else {
    selected_genes <- coef(lasso_model, s = lasso_model$lambda[which.max(lasso_model$nzero)])
  }
  
  # Prepare the input data for multivariate Cox regression
  X_selected <- X[, rownames(selected_genes)[selected_genes@i], drop = FALSE]
  
  # Fit a multivariate Cox regression model
  multivariate_cox_model <- coxph(Surv(Y$days_to_last_follow_up, Y$vital_status) ~ .,
                                  data = X_selected)
  
  # Calculate the risk scores for each patient
  patient_risk <- exp(coef(multivariate_cox_model) %*% t(as.matrix(X_selected)))
  # Calculate the median risk score
  median_pat_risk <- median(patient_risk)
  
  pat_risk_df <- patient_risk %>% 
    t() %>% 
    as.data.frame() %>%
    tibble::rownames_to_column("rna_id") %>%
    rename(risk=V1) %>% 
    mutate(risk_cat = ifelse(risk >= median_pat_risk,"high-risk","low-risk")) %>%
    bind_cols(Y)
  
  surv_obj <- Surv(pat_risk_df$days_to_last_follow_up, pat_risk_df$vital_status)
  
  surv_fit = survfit(
    Surv(days_to_last_follow_up, vital_status) ~ risk_cat,
    data = pat_risk_df
    )
  
  p_km_surv <- ggsurvplot(
    fit = surv_fit,
    data = pat_risk_df,
    risk.table = FALSE,
    surv.median.line = "hv",
    pval = TRUE,
    # pval.method = TRUE,
    pval.coord = c(0, 0.25),
    conf.int = TRUE,
    legend.title = "Risk Group",
    xlab = "Time (years)",
    ylab = "Survival Probability",
    title = "Kaplan-Meier Survival Curves by Risk Group"
  )
  
  logrank_test <- survdiff(
    Surv(days_to_last_follow_up, vital_status) ~
      risk_cat, 
    data = pat_risk_df)
  
  return(list(
    p_value_df=p_value_df,
    pat_risk_df=pat_risk_df,
    p_km_surv=p_km_surv,
    logrank_test=logrank_test,
    cox_model=multivariate_cox_model
  ))
}
#' RNA-Seq risk models
task_data_p_luminal <- task_data_p %>%
  filter(patient %in% bc_pat_luminal) %>%
  select(-patient)

task_data_p_her2 <- task_data_p %>%
  filter(patient %in% bc_pat_her2) %>%
  select(-patient)

task_data_p_tnbc <- task_data_p %>%
  filter(patient %in% bc_pat_tnbc) %>%
  select(-patient)

task_data_p_luminal_stgenes <- task_data_p %>%
  filter(patient %in% bc_pat_luminal) %>%
  select(-patient) %>%
  select(vital_status, days_to_last_follow_up, any_of(make_clean_names(gene_cols)))

task_data_p_her2_stgenes <- task_data_p %>%
  filter(patient %in% bc_pat_her2) %>%
  select(-patient) %>%
  select(vital_status, days_to_last_follow_up, any_of(make_clean_names(gene_cols)))

task_data_p_tnbc_stgenes <- task_data_p %>%
  filter(patient %in% bc_pat_tnbc) %>%
  select(-patient) %>%
  select(vital_status, days_to_last_follow_up, any_of(make_clean_names(gene_cols)))

#' Predicted GE risk models
# Luminal subset for predicted GE
surv_tasks_dat_list_luminal <- getSurvPredGEDatList(pat_subset=bc_pat_luminal)
surv_tasks_dat_list_her2 <- getSurvPredGEDatList(pat_subset=bc_pat_her2)
surv_tasks_dat_list_tnbc <- getSurvPredGEDatList(pat_subset=bc_pat_tnbc)


surv_dat_list <- c(
  list(task_data_p_luminal,   task_data_p_luminal_stgenes),
  surv_tasks_dat_list_luminal,
  list(task_data_p_her2,  task_data_p_her2_stgenes),
  surv_tasks_dat_list_her2,
  list(task_data_p_tnbc,
  task_data_p_tnbc_stgenes),
  surv_tasks_dat_list_tnbc
) %>%
  `names<-`(
    c("RNA-Seq_luminal",
    "RNA-Seq-STgenes_luminal",
    sprintf("%s_luminal", model_names),
    "RNA-Seq_her2",
    "RNA-Seq-STgenes_her2",
    sprintf("%s_her2", model_names),
    "RNA-Seq_tnbc",
    "RNA-Seq-STgenes_tnbc",
    sprintf("%s_tnbc", model_names))
  )
pred_ge_surv_risk_luminal <- lapply(
  surv_tasks_dat_list_luminal, 
  calcSurvRisk
  ) %>%
  `names<-`(model_names)

pred_ge_surv_risk_her2 <- lapply(
  seq_along(surv_tasks_dat_list_her2), 
  function(i) {
    calcSurvRisk(surv_data=surv_tasks_dat_list_her2[[i]])
  }) %>%
  `names<-`(model_names)

pred_ge_surv_risk_tnbc <- lapply(
    seq_along(surv_tasks_dat_list_tnbc), 
  function(i) {
    calcSurvRisk(surv_tasks_dat_list_tnbc[[i]])
  }) %>%
  `names<-`(model_names)

gt_ge_surv_risk_luminal <- calcSurvRisk(task_data_p_luminal)
gt_ge_surv_risk_her2 <- calcSurvRisk(task_data_p_her2)
gt_ge_surv_risk_tnbc <- calcSurvRisk(task_data_p_tnbc)
gt_ge_stgenes_surv_risk_luminal <- calcSurvRisk(task_data_p_luminal_stgenes)
gt_ge_stgenes_surv_risk_her2 <- calcSurvRisk(task_data_p_her2_stgenes)
gt_ge_stgenes_surv_risk_tnbc <- calcSurvRisk(task_data_p_tnbc_stgenes)

surv_risk_list <- c(
  list(gt_ge_surv_risk_luminal),
  list(gt_ge_stgenes_surv_risk_luminal),
  pred_ge_surv_risk_luminal,
  list(gt_ge_surv_risk_her2),
  list(gt_ge_stgenes_surv_risk_her2),
  pred_ge_surv_risk_her2,
  list(gt_ge_surv_risk_tnbc),
  list(gt_ge_stgenes_surv_risk_tnbc),
  pred_ge_surv_risk_tnbc
) %>%
  `names<-`(
    c("RNA-Seq_luminal",
    "RNA-Seq-STgenes_luminal",
    sprintf("%s_luminal", model_names),
    "RNA-Seq_her2",
    "RNA-Seq-STgenes_her2",
    sprintf("%s_her2", model_names),
    "RNA-Seq_tnbc",
    "RNA-Seq-STgenes_tnbc",
    sprintf("%s_tnbc", model_names))
  )

saveRDS(surv_risk_list,
        file="data/raw/tcga_brca/tcga_pred_ge_surv_risk_list_logrna_meannorm.rds")
surv_risk_list <- readRDS(file="data/raw/tcga_brca/tcga_pred_ge_surv_risk_list_logrna_meannorm.rds")

Train all using 5 top features

Check data

Supplementary Figure 17

Boxplot of gene expression values after transformation for each sample from the TNBC & HER2 subset of the TCGA data and for all genes that were measured in the HER2+ spatial transcriptomics dataset.

# Plot boxplot of all genes
surv_risk_top10feat_data <- lapply(
  seq_along(surv_dat_list),
  function(i) {
    surv_dat_list[[i]] %>%
      tibble::rownames_to_column("rna_id") %>%
      mutate(task_id = names(surv_dat_list)[i])
  }) %>%
  bind_rows() %>%
  pivot_longer(cols=!c("rna_id","task_id",
                       "vital_status","days_to_last_follow_up"),
               names_to="gene",
               values_to="exprs") %>%
  filter(!is.na(exprs)) %>%
  separate(task_id, c("data","subgroup"), sep="_")

supp_fig_surv_dat_tnbc <- surv_risk_top10feat_data %>%
  filter(subgroup=="tnbc") %>%
  ggplot()+
  aes(x=rna_id,y=exprs)+
  geom_boxplot()+
  facet_wrap(~data, scales="free")+
  theme(axis.text.x=element_blank(),
      axis.ticks.x=element_blank())+
  labs(title="Boxplots of all  genes in each data",
       subtitle="TNBC BC Subset")

supp_fig_surv_dat_tnbc

Supplementary Figure 18

supp_fig_surv_dat_her2 <- surv_risk_top10feat_data %>%
  filter(subgroup=="her2") %>%
  ggplot()+
  aes(x=rna_id,y=exprs)+
  geom_boxplot()+
  facet_wrap(~data, scales="free")+
  theme(axis.text.x=element_blank(),
      axis.ticks.x=element_blank())+
  labs(title="Boxplots of all  genes in each data",
       subtitle="HER2 BC Subset")

supp_fig_surv_dat_her2

CV - ClassifyR

set.seed(12)
system.time({
  surv_risk_classifyr_list_top5feat_cv_3fold100rep <- lapply(
    seq_along(surv_dat_list),
    function(i) {
      print(i)
      survCrossValidated <- crossValidate(
        surv_dat_list[[i]],
        c("days_to_last_follow_up", "vital_status"),
        nFeatures = c(5),
        nFolds = 3, nRepeats = 100,
        classifier="CoxPH",
        selectionMethod = "CoxPH",
        nCores = 40)
    }) %>%
    `names<-`(names(surv_dat_list))
})

saveRDS(surv_risk_classifyr_list_top5feat_cv_3fold100rep,
        file="data/raw/tcga_brca/final_surv_risk_classifyr_list_top5feat_cv_3fold100rep.rds")
surv_risk_classifyr_list_top5feat_cv_3fold100rep <- readRDS(
  file="data/raw/tcga_brca/final_surv_risk_classifyr_list_top5feat_cv_3fold100rep.rds")

surv_eval_dat <- surv_risk_classifyr_list_top5feat_cv_3fold100rep

surv_risk_classifyr_cindex <- lapply(
  seq_along(surv_eval_dat),
  function(i) {
    surv_eval_dat[[i]]@predictions %>%
      as.data.frame() %>%
      left_join(
        data.frame(
          sample = surv_eval_dat[[i]]@originalNames,
          truth = surv_eval_dat[[i]]@actualOutcome
        ),
        by=c("sample")
      ) %>%
      mutate(subset_name = names(surv_eval_dat)[i])
  } 
) %>%
  bind_rows() %>%
  left_join(
    colData(tcga_rnaseq_data) %>% 
      as.data.frame() %>% 
      select(barcode, paper_pathologic_stage),
    by=c("sample"="barcode")
  ) %>%
  mutate(time=as.numeric(gsub("\\+","",truth)),
         event=ifelse(grepl("\\+",truth),0,1)) %>%
  group_by(subset_name,permutation,fold) %>%
  group_modify(~{
    concordance(
      truth~risk,
      data=.x,
      reverse=TRUE)$concordance %>%
      as.data.frame() %>%
      rename("cindex"=".")
  }) %>%
  group_by(subset_name) %>% 
  mutate(cindex_avg=mean(cindex))

Plots

Figure 4c-e

C-indices of multivariate cox regression models predicting survival of TCGA-BRCA patients, using RNA-Seq bulk, RNA-Seq bulk using only genes present in HER2+ ST dataset, and the predicted pseudobulk from each method. C-indices were calculated from the test sets of a 3-fold CV with 100 repeats trained within HER2+, luminal and TNBC breast cancer clinical subtypes

# Plot cindex
p_cind_boxplot <- surv_risk_classifyr_cindex %>% 
  separate_wider_delim("subset_name", "_", names=c("data","subgroup")) %>%
  group_by(data, permutation, subgroup) %>%
  summarise(cindex = mean(cindex, na.rm=T), .groups="keep") %>%
  group_by(data) %>%
  mutate(cindex_avg = mean(cindex)) %>% 
  ungroup() %>%
  left_join(
    surv_risk_classifyr_cindex %>%
      separate_wider_delim("subset_name", "_", names=c("data","subgroup"))%>% 
      filter(!(data %in% c("RNA-Seq","RNA-Seq-STgenes"))) %>%
      group_by(data, subgroup) %>%
      summarise(cindex=mean(cindex,na.rm=T), .groups="drop") %>% 
      group_by(subgroup) %>% 
      mutate(max_ind = ifelse(cindex == max(cindex), "*","")) %>%
      select(data, subgroup, max_ind),
    by=c("data"="data","subgroup"="subgroup")
  ) %>%
  ungroup() %>% 
  mutate(data = factor(data, levels = c("THItoGene", "TCGN", "EGNv2","EGNv1", "Hist2ST", "DeepPT", "DeepSpaCE", "GeneCodeR", "HisToGene", "ST-Net", "RNA-Seq" ,"RNA-Seq-STgenes"))) %>% 
  mutate(data = as.factor(data)) %>% 
  mutate(subgroup=case_when(
    subgroup=="her2" ~ "HER2+",
    subgroup=="tnbc" ~ "TNBC",
    subgroup=="luminal" ~ "Luminal",
    TRUE~ ""
  )) %>% 
  ggplot() +
  aes(x=data,y=cindex, colour=data, fill=data)+
  geom_boxplot(alpha=.5) +
  facet_wrap(~subgroup, nrow=2, scales="free_x")+
  # scale_y_continuous(limits = c(0, NA), expand = c(0,0))+
  scale_fill_manual(values=dl_method_pal)+
  scale_colour_manual(values=dl_method_pal)+
  coord_flip()+
  labs(#title="C-index over each BC subgroup",
       #subtitle="Models fit using top 10 genes within each data subset",
       x="",y="C-index")+
  th +
  theme(legend.position = "none")

p_cind_boxplot

Figure 4i

Kaplan-Meier curves constructed using the average risk prediction from a 3-fold CV with 100 repeats within the HER2+ breast cancer subtype.

avg_risk_list <- lapply(
  seq_along(surv_eval_dat),
  function(i) {
    surv_eval_dat[[i]]@predictions %>%
      as.data.frame() %>%
      left_join(
        data.frame(
          sample = surv_eval_dat[[i]]@originalNames,
          truth = surv_eval_dat[[i]]@actualOutcome
        ),
        by=c("sample")
      ) %>%
      mutate(subset_name = names(surv_eval_dat)[i])
  } 
) %>%
  bind_rows() %>%
  mutate(truth = as.character(truth)) %>% 
  mutate(time=as.numeric(gsub("\\+","",truth)),
         event=ifelse(grepl("\\+",truth),0,1)) %>%
  group_by(subset_name, sample, truth, time, event) %>% 
  summarise(avg_risk = mean(risk), .groups="drop") %>%
  group_by(subset_name) %>%
  mutate(risk_cat = ifelse(avg_risk >= median(avg_risk),"high-risk","low-risk")) %>%
  split(., f=.$subset_name)

km_list <- lapply(
  seq_along(avg_risk_list),
  function(i) {
    surv_fit = survfit(Surv(time, event) ~ risk_cat,
                       data = avg_risk_list[[i]])
    
    p_km_surv <- ggsurvplot(
      fit = surv_fit,
      data = avg_risk_list[[i]],
      risk.table = FALSE,
      pval = TRUE,
      pval.size = 4,
      legend.title = "Risk Group",
      xlab = "Time (days)",
      ylab = "Survival Probability",
      title = "Kaplan-Meier Survival Curves by Risk Group"
    )
    
    # calculate logrank pvalue
    p_value <- surv_pvalue(surv_fit(
      Surv(time, event) ~ risk_cat,
      data = avg_risk_list[[i]]
    ))
  
    logrank_test <- survdiff(Surv(time, event) ~
                               risk_cat,
                             data = avg_risk_list[[i]])
    
    return(
      p_km_surv$data.survplot %>%
        mutate(pval.txt = p_value$pval.txt) %>%
        mutate(pval = p_value$pval) %>%
        mutate(subgroup = names(avg_risk_list)[i])
      )
  } 
)

# extract plot data from ggsurvplot
km_surv_plot_dat_df <- km_list %>%
  bind_rows() %>%
  separate(subgroup, c("data", "subgroup"), sep = "_") %>%
  mutate(
    subgroup = case_when(
      subgroup == "her2" ~ "HER2+",
      subgroup == "tnbc" ~ "TNBC",
      subgroup == "luminal" ~ "Luminal",
      TRUE ~ ""
    ) %>%
      factor(., levels = c("HER2+", "TNBC", "Luminal"))
  )

th <-   theme(text=element_text(size=16),
                axis.text.x = element_text(angle = 0, hjust = 0.5),
                panel.grid.major = element_blank(),
                panel.grid.minor = element_blank(),
                panel.background = element_rect(colour = "black", size=0.7, fill=NA) )

fig_4i_cv <- km_surv_plot_dat_df %>%
  filter(subgroup=="HER2+" & data!="RNA-Seq") %>%
  mutate(
    data = factor(data,
                  levels=c("RNA-Seq-STgenes", 
                           "ST-Net", 
                           "HisToGene", 
                           "GeneCodeR", 
                           "DeepSpaCE", 
                           "DeepPT", 
                           "Hist2ST", 
                           "EGNv1", 
                           "EGNv2", 
                           "TCGN", 
                           "THItoGene")
                  )
  ) %>%
  ggplot(aes(x=time/365,y=surv, col=risk_cat))+
  geom_step(linewidth = 1, linetype = 1)+
  geom_point(size = 4.5, shape = "+")+
  geom_text(aes(label=pval.txt, x=-Inf,y=-Inf),
            vjust=-1,hjust=-.2,
            colour="grey60", check_overlap = TRUE)+
  scale_y_continuous(limits = c(0, 1.05), expand=c(0,0))+
  scale_x_continuous(expand=c(0,0))+
  ggh4x::facet_grid2(rows=vars(subgroup),cols=vars(data), 
                     scales="free", independent = "all",switch="y")+
  theme(legend.position="none",
        strip.placement = "outside",
        strip.switch.pad.grid = unit(1, "cm"),
        axis.title.y = element_text(vjust = -15))+
  th+
  labs(x="Time (years)",y="Survival probability",col="Risk Category")

fig_4i_cv
## Warning: The `scale_name` argument of `discrete_scale()` is deprecated as of ggplot2
## 3.5.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Supplementary Figure 16
fig_4e_cv_tnbc_luminal <- km_surv_plot_dat_df %>%
  filter(subgroup!="HER2+" & data!="RNA-Seq") %>%
  mutate(
    data = factor(data,
                  levels=c("RNA-Seq-STgenes", 
                           "ST-Net", 
                           "HisToGene", 
                           "GeneCodeR", 
                           "DeepSpaCE", 
                           "DeepPT", 
                           "Hist2ST", 
                           "EGNv1", 
                           "EGNv2",
                           "TCGN", 
                           "THItoGene")
                  )
  ) %>%
  ggplot(aes(x=time/365,y=surv, col=risk_cat))+
  geom_step(linewidth = 1, linetype = 1)+
  geom_point(size = 4.5, shape = "+")+
  geom_text(aes(label=pval.txt, x=-Inf,y=-Inf),
            vjust=-1,hjust=-.2,
            colour="grey60", check_overlap = TRUE)+
  scale_y_continuous(limits = c(0, 1.05), expand=c(0,0))+
  scale_x_continuous(expand=c(0,0))+
  ggh4x::facet_grid2(rows=vars(subgroup),cols=vars(data), 
                     scales="free", independent = "all",switch="y")+
  theme(legend.position="bottom",
        strip.placement = "outside",
        strip.switch.pad.grid = unit(1, "cm"),
        axis.title.y = element_text(vjust = -15))+
  th +
  labs(x="Time (years)",y="Survival probability",col="Risk Category")

fig_4e_cv_tnbc_luminal

Without CV

Figure 4f-h

Kaplan-Meier curves for patients split into high and low risk groups by the median risk prediction of the multivariate cox regression models for each method and (f) HER2+, (g) luminal and (h) TNBC breast cancer subtypes. Models were trained on all patients and then the predictions of the training patients were used. The p-value represents the result of the logrank test for assessing the statistical significance of differences in survival between the groups.

calcSurvRisk2 <- function(surv_data, selected_genes) {
  # Prepare the input data for Lasso regression
  X <- surv_data[, selected_genes, drop = FALSE]
  Y <- surv_data[, c("days_to_last_follow_up", "vital_status")]
  
  # Fit a multivariate Cox regression model
  multivariate_cox_model <- coxph(Surv(Y$days_to_last_follow_up, Y$vital_status) ~ .,
                                  data = X)
  
  # Calculate the risk scores for each patient
  patient_risk <- coef(multivariate_cox_model) %*% t(as.matrix(X))
  # Calculate the median risk score
  median_pat_risk <- median(patient_risk)
  
  pat_risk_df <- patient_risk %>% 
    t() %>% 
    as.data.frame() %>%
    tibble::rownames_to_column("rna_id") %>%
    rename(risk=V1) %>% 
    mutate(risk_cat = ifelse(risk >= median_pat_risk,"high-risk","low-risk")) %>%
    bind_cols(Y)
  
  surv_obj <- Surv(pat_risk_df$days_to_last_follow_up, pat_risk_df$vital_status)
  
  surv_fit = survfit(
    Surv(days_to_last_follow_up, vital_status) ~ risk_cat,
    data = pat_risk_df
    )
  
  p_km_surv <- ggsurvplot(
    fit = surv_fit,
    data = pat_risk_df,
    risk.table = FALSE,
    pval = TRUE,
    pval.size=4,
    legend.title = "Risk Group",
    xlab = "Time (years)",
    ylab = "Survival Probability",
    title = "Kaplan-Meier Survival Curves by Risk Group"
  )
  
  logrank_test <- survdiff(
    Surv(days_to_last_follow_up, vital_status) ~
      risk_cat, 
    data = pat_risk_df)
  
  return(list(
    pat_risk_df=pat_risk_df,
    p_km_surv=p_km_surv,
    logrank_test=logrank_test,
    cox_model=multivariate_cox_model
  ))
}

surv_risk_list_top5feat <- lapply(
  seq_along(surv_dat_list),
  function(i) {
    calcSurvRisk2(
      surv_data=surv_dat_list[[i]],
      selected_genes=surv_risk_list[[i]]$p_value_df %>% 
        arrange(p_value) %>% 
        slice(1:5) %>% 
        pull(gene)
      )
  }) %>%
  `names<-`(names(surv_dat_list))

# Plot survival curves
km_surv_plot_dat_df <- lapply(which(!(names(surv_risk_list_top5feat) %in% c("RNA-Seq_luminal",
                                                                 "RNA-Seq_her2",
                                                                 "RNA-Seq_tnbc"))),
                  function(i) {
                    # calculate logrank pvalue
                    p_value <- surv_pvalue(
                      surv_fit(
                        Surv(days_to_last_follow_up, vital_status) ~ risk_cat, 
                        data = surv_risk_list_top5feat[[i]]$pat_risk_df
                        )
                      )
                    # extract plot data from ggsurvplot
                    surv_risk_list_top5feat[[i]]$p_km_surv$data.survplot %>%
                      mutate(pval.txt = sprintf("p = %.2g", p_value$pval)) %>%
                      mutate(pval=p_value$pval) %>%
                      mutate(subgroup = names(surv_risk_list_top5feat)[i])
       }) %>%
  bind_rows() %>%
  separate(subgroup, c("data","subgroup"), sep="_") %>%
  mutate(subgroup=case_when(
    subgroup=="her2" ~ "HER2+",
    subgroup=="tnbc" ~ "TNBC",
    subgroup=="luminal" ~ "Luminal",
    TRUE~ ""
  ) %>% 
    factor(., levels=c("HER2+","TNBC","Luminal")))

fig_4f_h <- km_surv_plot_dat_df %>%
  mutate(
    data = factor(data,
                  levels=c("RNA-Seq-STgenes", km_surv_plot_dat_df %>% 
                             distinct(pval,data,subgroup) %>%
                             filter(data != "RNA-Seq-STgenes") %>% 
                             group_by(data) %>% 
                             summarise(mean_pval = mean(pval)) %>%
                             arrange(mean_pval) %>% 
                             pull(data)))
  ) %>%
    mutate(
    data = factor(data,
                  levels=c("RNA-Seq-STgenes", 
                           "ST-Net", 
                           "HisToGene", 
                           "GeneCodeR", 
                           "DeepSpaCE", 
                           "DeepPT", 
                           "Hist2ST", 
                           "EGNv1", 
                           "EGNv2",
                           "TCGN", 
                           "THItoGene")
                  )
  ) %>%
  ggplot(aes(x=time/365,y=surv, col=risk_cat))+
  geom_step(linewidth = 1, linetype = 1)+
  geom_point(size = 4.5, shape = "+")+
  geom_text(aes(label=pval.txt, x=-Inf,y=-Inf),
            vjust=-1,hjust=-.2,
            colour="grey60", check_overlap = TRUE)+
  scale_y_continuous(limits = c(0, 1.05), expand=c(0,0))+
  scale_x_continuous(expand=c(0,0))+
  ggh4x::facet_grid2(rows=vars(subgroup),cols=vars(data), 
                     scales="free", independent = "all",switch="y")+
  theme(legend.position="bottom",
        strip.placement = "outside",
        strip.switch.pad.grid = unit(1, "cm"),
        axis.title.y = element_text(vjust = -15))+
  th+
  labs(x="Time (years)",y="Survival probability",col="Risk Category")

fig_4f_h

Supplementary Figure 15
#old version c-index
risk_cat_p_df_top5feat <- lapply(
  seq_along(surv_risk_list_top5feat),
  function(i) {
    data.frame(
      task_id = names(surv_risk_list_top5feat)[i],
      logrank_p = surv_risk_list_top5feat[[i]]$logrank_test$pvalue,
      cindex = survival::concordance(surv_risk_list_top5feat[[i]]$cox_model)$concordance
    )
  }
) %>%
  bind_rows() %>%
  separate(task_id, c("data","subgroup"), sep="_")

risk_cat_p_df_top5feat <- risk_cat_p_df_top5feat%>%
  arrange(factor(data, levels=c("proposed", "THItoGene", "TCGN", "EGNv2", "EGNv1", "Hist2ST", "DeepPT",   "eepSpaCE", "GeneCodeR","HisToGene","ST-Net", "RNA-Seq-STgenes", "RNA-Seq")))

risk_cat_p_df_top5feat$data <- factor(risk_cat_p_df_top5feat$data, levels = unique(risk_cat_p_df_top5feat$data))

# Dottlot of c-index of coxph fits
fig_4bcd <- risk_cat_p_df_top5feat %>% 
  group_by(data) %>%
  mutate(mean_cindex = mean(cindex)) %>%
  ungroup() %>%
  left_join(
    risk_cat_p_df_top5feat %>% 
      filter(!(data %in% c("RNA-Seq","RNA-Seq-STgenes"))) %>%
      group_by(subgroup) %>% 
      mutate(max_ind = ifelse(cindex == max(cindex), "*","")) %>%
      select(data, subgroup, max_ind),
    by=c("data"="data","subgroup"="subgroup")
  ) %>%
  mutate(data = as.factor(data)) %>% 
  mutate(subgroup=case_when(
    subgroup=="her2" ~ "HER2+",
    subgroup=="tnbc" ~ "TNBC",
    subgroup=="luminal" ~ "Luminal",
    TRUE~ ""
  )) %>% 
  
  ggplot() +
  aes(x=data,y=cindex, colour=data, fill=data,
      group=subgroup)+
  geom_bar(stat="identity", alpha=.65,width=.85)+
  geom_text(aes(label = max_ind), 
            hjust=1.75, vjust = 0.78, size=5,
            colour = "white")+
  facet_wrap(~subgroup, nrow=2, scales="free_x")+
  scale_y_continuous(limits = c(0, NA), expand = c(0,0))+
  scale_fill_manual(values=dl_method_pal)+
  scale_colour_manual(values=dl_method_pal)+
  coord_flip()+
  labs(x="",y="C-index")+
  theme(legend.position = "none")

fig_4bcd
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_text()`).

Rank methods

Pseudobulk

# Average each metric over each method and rank
pred_rank_gene_df <- tcga_pred_m_df %>%
  ungroup() %>%
  # Inf values in NRMSE, remove them
  mutate_at(vars(c(nrmse_range, nrmse_sd)),
            function(col) ifelse(col == Inf, NaN, col)) %>%
  # Average over each model
  group_by(model_id) %>%
  summarise_if(is.numeric, function(col) mean(col,na.rm = T)) %>%
  mutate(cor_pearson_gene_r = rank(-cor_pearson),
         nrmse_gene_r = rank(nrmse_range),
         js_div_gene_r = rank(js_div),
         mi_gene_r = rank(-mi)) %>%
  rowwise() %>%
  mutate(mean_gene_rank = mean(c(cor_pearson_gene_r, nrmse_gene_r, js_div_gene_r, mi_gene_r))) %>%
  ungroup() %>% 
  arrange(mean_gene_rank) %>%
  mutate(model_id = factor(model_id, levels=unique(model_id))) %>%
  pivot_longer(
    cols=c("cor_pearson_gene_r", "nrmse_gene_r",
           "js_div_gene_r", "mi_gene_r", "mean_gene_rank"),
    names_to="metric_rank",
    values_to = "rank"
  ) %>%
  mutate(metric_rank = factor(metric_rank,
                              levels=c("cor_pearson_gene_r", "nrmse_gene_r",
                                       "js_div_gene_r", "mi_gene_r", "mean_gene_rank")))

pred_rank_pat_df <- tcga_pred_pat_m_df %>%
  ungroup() %>%
  # Inf values in NRMSE, remove them
  mutate_at(vars(c(nrmse_range, nrmse_sd)),
            function(col) ifelse(col == Inf, NaN, col)) %>%
  # Average over each model
  group_by(model_id) %>%
  summarise_if(is.numeric, function(col) mean(col,na.rm = T)) %>%
  mutate(cor_pearson_pat_r = rank(-cor_pearson),
         nrmse_pat_r = rank(nrmse_range),
         js_div_pat_r = rank(js_div),
         mi_pat_r = rank(-mi)) %>%
  rowwise() %>%
  mutate(mean_pat_rank = mean(c(cor_pearson_pat_r, nrmse_pat_r, js_div_pat_r, mi_pat_r))) %>%
  ungroup() %>% 
  arrange(mean_pat_rank) %>%
  mutate(model_id = factor(model_id, levels=unique(model_id))) %>%
  pivot_longer(
    cols=c("cor_pearson_pat_r", "nrmse_pat_r",
           "js_div_pat_r", "mi_pat_r", "mean_pat_rank"),
    names_to="metric_rank",
    values_to = "rank"
  ) %>%
  mutate(metric_rank = factor(metric_rank,
                              levels=c("cor_pearson_pat_r", "nrmse_pat_r",
                                       "js_div_pat_r", "mi_pat_r", "mean_pat_rank")))

pred_rank_df <- pred_rank_pat_df %>%
  select(model_id, metric_rank, rank) %>%
  bind_rows(
    pred_rank_gene_df %>%
      select(model_id, metric_rank, rank)
  )

saveRDS(pred_rank_df,
        file="data/processed/her2st/pred_rank_df_tcgabrca.rds")

Clinical Translation

surv_cv_avg_df <- surv_risk_classifyr_cindex %>% 
  separate_wider_delim("subset_name", "_", names=c("data","subgroup")) %>%
  filter(!(data %in% c("RNA-Seq","RNA-Seq-STgenes"))) %>%
  group_by(data, permutation, subgroup) %>%
  summarise(cindex = mean(cindex, na.rm=T), .groups="drop") %>%
  group_by(data, subgroup) %>%
  summarise(cindex = mean(cindex), .groups="drop") %>% 
  group_by(data) %>%
  summarise(cindex = mean(cindex), .groups="drop") %>%
  mutate(cindex_cv_r = rank(-cindex)) %>%
  left_join(
    km_surv_plot_dat_df %>%
      filter(!(data %in% c("RNA-Seq","RNA-Seq-STgenes"))) %>%
      distinct(pval, data, subgroup) %>%
      group_by(data) %>%
      summarise(surv_diff_cv = mean(pval)) %>%
      mutate(surv_diff_cv_r = rank(surv_diff_cv)),
    by="data"
  ) %>%
  select(data, cindex_cv_r, surv_diff_cv_r)

clin_trans_rank_df <- risk_cat_p_df_top5feat %>% 
  filter(!(data %in% c("RNA-Seq","RNA-Seq-STgenes"))) %>%
  group_by(data) %>%
  summarise(cindex = mean(cindex),
            pval = mean(logrank_p)) %>%
  mutate(cindex_r = rank(-cindex),
         surv_diff_r = rank(pval)) %>%
  select(-cindex, -pval) %>%
  left_join(
    surv_cv_avg_df,
    by="data"
  ) %>%
  rowwise() %>%
  mutate(mean_clin_r = mean(c(cindex_r, surv_diff_r,
                              cindex_cv_r, surv_diff_cv_r))) %>%
  ungroup() %>%
  pivot_longer(cols=!c("data"),
               values_to="rank", names_to="metric_rank")

saveRDS(clin_trans_rank_df,
        file="./data/processed/her2st/pred_rank_df_tcga_clinicaltrans.rds")

Session Info

sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Debian GNU/Linux 12 (bookworm)
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.21.so;  LAPACK version 3.11.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: Australia/Sydney
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] pROC_1.18.5                 infotheo_1.2.0.1           
##  [3] ggthemr_1.1.0               survminer_0.5.0            
##  [5] ggpubr_0.6.0                ggplot2_3.5.1              
##  [7] ClassifyR_3.8.5             survival_3.7-0             
##  [9] BiocParallel_1.38.0         MultiAssayExperiment_1.30.3
## [11] generics_0.1.3              glmnet_4.1-8               
## [13] Matrix_1.7-0                cutpointr_1.2.0            
## [15] scran_1.32.0                scuttle_1.14.0             
## [17] circlize_0.4.16             ComplexHeatmap_2.20.0      
## [19] cowplot_1.1.3               data.table_1.16.4          
## [21] readxl_1.4.3                stringr_1.5.1              
## [23] tidyr_1.3.1                 janitor_2.2.0              
## [25] magrittr_2.0.3              dplyr_1.1.4                
## [27] SingleCellExperiment_1.26.0 SummarizedExperiment_1.34.0
## [29] Biobase_2.64.0              GenomicRanges_1.56.0       
## [31] GenomeInfoDb_1.40.1         IRanges_2.38.0             
## [33] S4Vectors_0.42.0            BiocGenerics_0.50.0        
## [35] MatrixGenerics_1.16.0       matrixStats_1.3.0          
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3        rstudioapi_0.16.0        
##   [3] jsonlite_1.8.8            shape_1.4.6.1            
##   [5] farver_2.1.2              rmarkdown_2.27           
##   [7] GlobalOptions_0.1.2       zlibbioc_1.50.0          
##   [9] vctrs_0.6.5               DelayedMatrixStats_1.26.0
##  [11] rstatix_0.7.2             htmltools_0.5.8.1        
##  [13] S4Arrays_1.4.1            BiocNeighbors_1.22.0     
##  [15] broom_1.0.7               cellranger_1.1.0         
##  [17] SparseArray_1.4.8         Formula_1.2-5            
##  [19] sass_0.4.9                bslib_0.7.0              
##  [21] plyr_1.8.9                zoo_1.8-12               
##  [23] lubridate_1.9.3           cachem_1.1.0             
##  [25] igraph_2.0.3              lifecycle_1.0.4          
##  [27] iterators_1.0.14          pkgconfig_2.0.3          
##  [29] rsvd_1.0.5                R6_2.5.1                 
##  [31] fastmap_1.2.0             GenomeInfoDbData_1.2.12  
##  [33] snakecase_0.11.1          clue_0.3-65              
##  [35] digest_0.6.35             colorspace_2.1-0         
##  [37] dqrng_0.4.0               irlba_2.3.5.1            
##  [39] beachmat_2.20.0           labeling_0.4.3           
##  [41] philentropy_0.8.0         fansi_1.0.6              
##  [43] km.ci_0.5-6               timechange_0.3.0         
##  [45] mgcv_1.9-1                httr_1.4.7               
##  [47] abind_1.4-5               compiler_4.4.1           
##  [49] withr_3.0.0               doParallel_1.0.17        
##  [51] backports_1.5.0           carData_3.0-5            
##  [53] highr_0.11                ggupset_0.4.0            
##  [55] ggsignif_0.6.4            DelayedArray_0.30.1      
##  [57] rjson_0.2.21              bluster_1.14.0           
##  [59] tools_4.4.1               glue_1.7.0               
##  [61] nlme_3.1-165              cluster_2.1.6            
##  [63] reshape2_1.4.4            gtable_0.3.5             
##  [65] KMsurv_0.1-5              BiocSingular_1.20.0      
##  [67] ScaledMatrix_1.12.0       metapod_1.12.0           
##  [69] car_3.1-3                 utf8_1.2.4               
##  [71] XVector_0.44.0            foreach_1.5.2            
##  [73] pillar_1.9.0              limma_3.60.2             
##  [75] splines_4.4.1             lattice_0.22-6           
##  [77] tidyselect_1.2.1          locfit_1.5-9.9           
##  [79] knitr_1.46                gridExtra_2.3            
##  [81] edgeR_4.2.0               xfun_0.44                
##  [83] statmod_1.5.0             stringi_1.8.4            
##  [85] UCSC.utils_1.0.0          yaml_2.3.8               
##  [87] evaluate_0.23             codetools_0.2-20         
##  [89] tibble_3.2.1              cli_3.6.2                
##  [91] xtable_1.8-4              munsell_0.5.1            
##  [93] jquerylib_0.1.4           survMisc_0.5.6           
##  [95] Rcpp_1.0.12               png_0.1-8                
##  [97] parallel_4.4.1            ggh4x_0.3.0              
##  [99] sparseMatrixStats_1.16.0  scales_1.3.0             
## [101] purrr_1.0.2               crayon_1.5.2             
## [103] GetoptLong_1.0.5          rlang_1.1.3